Parse Twitter Data

  1. Import retrieved tweets (from JSON file, pickle or similar)
  2. Read in individual tweets
  3. Create TSV file (and drop unwanted data)

Jupyter Notebook Style

Let's make this thing look nice.


In [2]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[2]:

Get Data and Enrich It


In [3]:
import sys,re,json,os,csv
import numpy as np
import cPickle as pickle
import uuid
from IPython.display import display_javascript, display_html, display

Read JSON or Pickle File with Tweets

  • Example pickle, Mac: /Users/[username]/Documents/twitter-analysis/data/raw/tweets.p

In [4]:
picklepath = '/Users/rcn/Desktop/twitter-analysis/data/raw/tweets.p'

In [16]:
tweets = pickle.load( open(picklepath, "rb" ) )

Number of Tweets


In [17]:
print('We have %d tweets in total' % len(tweets))


We have 6727 tweets in total

What Does a Tweet Look Like?

Let's make JSON look nice (with thanks to Renderjson)


In [18]:
class RenderJSON(object):
    def __init__(self, json_data):
        if isinstance(json_data, dict):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [19]:
RenderJSON(tweets[0])


Get Rid of Line Breaks in Tweets


In [20]:
tweetLinebreakError=0

for tweet in tweets:
  try:
    tweet['text'] = tweet['text'].replace('\n', ' ').replace('\r', '')
  except:
    tweetLinebreakError+=1
    tweet['text'] = 'NaN'

print('Failed removing line breaks in %d tweets' % tweetLinebreakError)


Failed removing line breaks in 0 tweets

Save Data to Disk

Setup Local Paths

Paths on your machine to the file you'd like write to.

  • Example tsv, Mac: /Users/[username]/Documents/twitter-analysis/data/tweets.tsv
  • Example pickle, Mac: /Users/[username]/Documents/twitter-analysis/data/tweets.p

In [ ]:
jsonpath = '' # Path to JSON file
picklepath = '' # Path to pickle file
tsvpath = '/Users/rcn/Desktop/twitter-analysis/data/tweets.tsv' # Path to tsv file

Save as JSON


In [11]:
with open(jsonpath, 'wb') as tweetsfile: # Get ready to write to output file
    json.dump(tweets, tweetsfile) # Write tweets to json file

Save as Pickle file


In [12]:
with open(picklepath, "wb") as tweetsfile:
    pickle.dump(tweets, tweetsfile) # Write tweets to pickle file

Save as TSV


In [22]:
header=['Tweet ID','Time','User','Username','Text','Language','User Location','Geo','Place','Likes','Retweets',
        'Followers','Friends','Listed','Favourites','Hashtags','Mentions','Links','User Description']
outFile=csv.writer(open(tsvpath,'wb'),delimiter='\t')
outFile.writerow(header)

In [23]:
nIdError = 0
nDateError = 0
nNameError = 0
nScreenNameError = 0
nTextError = 0
nLanguageError = 0
nLocationError = 0
nGeoError = 0
nPlaceError = 0
nLikesError = 0
nRetweetsError = 0
nFollowersError = 0
nFriendsError = 0
nListedError = 0
nFavouritesError = 0
nTagsError = 0
nMentionsError = 0
nLinksError = 0
nDescriptionError = 0

documents=[]

for tweet in tweets:
  outList=[]
  try:
    outList.append(tweet['id'])
    documents.append(tweet['id'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nIdError+=1
  try:
    outList.append(tweet['created_at'])
    documents.append(tweet['created_at'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nDateError+=1
  try:
    outList.append(tweet['user']['name'].encode('utf-8'))
    documents.append(tweet['user']['name'].encode('utf-8'))
  except:
    nNameError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    outList.append(tweet['user']['screen_name'])
    documents.append(tweet['user']['screen_name'])
  except:
    nScreenNameError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    outList.append(tweet['text'].encode('utf-8'))
    documents.append(tweet['text'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nTextError+=1
  try:
    outList.append(tweet['lang'])
    documents.append(tweet['lang'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nLanguageError+=1
  try:
    outList.append(tweet['user']['location'].encode('utf-8'))
    documents.append(tweet['user']['location'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nLocationError+=1
  try:
    outList.append(tweet['geo'].encode('utf-8'))
    documents.append(tweet['geo'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nGeoError+=1
  try:
    outList.append(tweet['place'].encode('utf-8'))
    documents.append(tweet['place'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nPlaceError+=1
  try:
    outList.append(tweet['favorite_count'])
    documents.append(tweet['favorite_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nLikesError+=1
  try:
    outList.append(tweet['retweet_count'])
    documents.append(tweet['retweet_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nRetweetsError+=1
  try:
    outList.append(tweet['user']['followers_count'])
    documents.append(tweet['user']['followers_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nFollowersError+=1
  try:
    outList.append(tweet['user']['friends_count'])
    documents.append(tweet['user']['friends_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nFriendsError+=1
  try:
    outList.append(tweet['user']['listed_count'])
    documents.append(tweet['user']['listed_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nListedError+=1
  try:
    outList.append(tweet['user']['favourites_count'])
    documents.append(tweet['user']['favourites_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nFavouritesError+=1
  try:
    tweetTags=','.join([h.lower() for h in tweet['entities']['hashtags']])
    outList.append(tweetTags.decode('utf-8'))
    documents.append(tweetTags.decode('utf-8'))
  except:
    nTagsError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    tweetMentions=','.join([m.lower() for m in tweet['entities']['user_mentions']])
    outList.append(tweetMentions.decode('utf-8'))
    documents.append(tweetMentions.decode('utf-8'))
  except:
    nMentionsError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    tweetLinks=','.join([m.lower() for m in tweet['entities']['urls']])
    outList.append(tweetLinks.decode('utf-8'))
    documents.append(tweetLinks.decode('utf-8'))
  except:
    nLinksError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    outList.append(tweet['user']['description'].encode('utf-8'))
    documents.append(tweet['user']['description'].encode('utf-8'))
  except:
    nDescriptionError+=1
    outList.append('NaN')
    documents.append('NaN')
    
            
  outFile.writerow(outList)

print "%d ID errors." % nIdError
print "%d date errors." % nDateError
print "%d name errors." % nNameError
print "%d screen name errors." % nScreenNameError
print "%d text errors." % nTextError
print "%d language errors." % nLanguageError
print "%d user location errors." % nLocationError
print "%d tweet geo errors." % nGeoError
print "%d tweet place errors." % nPlaceError
print "%d likes errors." % nLikesError
print "%d retweets errors." % nRetweetsError
print "%d followers errors." % nFollowersError
print "%d friends errors." % nFriendsError
print "%d listed errors." % nListedError
print "%d favourites errors." % nFavouritesError
print "%d hashtag errors." % nTagsError
print "%d mention errors." % nMentionsError
print "%d link errors." % nLinksError
print "%d Description errors." % nDescriptionError


0 ID errors.
0 date errors.
0 name errors.
0 screen name errors.
0 text errors.
0 language errors.
0 user location errors.
6727 tweet geo errors.
6727 tweet place errors.
0 likes errors.
0 retweets errors.
0 followers errors.
0 friends errors.
0 listed errors.
0 favourites errors.
5323 hashtag errors.
3076 mention errors.
2039 link errors.
0 Description errors.

In [ ]: